library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer)
library(Rtsne)
library(knitr)

Load and prepare data

Load dataset (preprocessing code in 19_10_11_create_dataset.Rmd)

# WGCNA clustering whose module assignments / MM columns the dataset was built with;
# several column selections below depend on this choice
clustering_selected = 'DynamicHybridMergedSmall'
print(paste0('Using clustering ', clustering_selected))
## [1] "Using clustering DynamicHybridMergedSmall"
# Dataset created with DynamicTreeMerged algorithm
# Genes are the rows (Ensembl IDs as row names), features are the columns
dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'), row.names=1)

Gene filtering:

  • Remove genes without cluster (Module=gray)

  • Remove genes with Module-Trait correlation missing

# Module colour(s) whose Module-Trait correlation is missing; these modules are
# removed from the dataset below and their MM columns are dropped later
rm_cluster = dataset[is.na(dataset$MTcor),clustering_selected] %>% unique %>% as.character

print(paste0('Removing ', sum(dataset[,clustering_selected]=='gray'), ' genes without cluster'))
## [1] "Removing 178 genes without cluster"
# NOTE(review): if rm_cluster ever has length > 1, paste0 recycles and this prints
# one message per module — confirm that is the intended behaviour
print(paste0('Removing ', sum(is.na(dataset$MTcor)), ' genes belonging to module(s) without module-trait correlation: ',
             rm_cluster))
## [1] "Removing 1555 genes belonging to module(s) without module-trait correlation: #00BD64"
# Keep only genes assigned to a module (not 'gray') whose module has an MTcor value
new_dataset = dataset %>% filter(dataset[,clustering_selected]!='gray' & !is.na(MTcor))

Variable changes:

  • Using Module Membership variables instead of binary module membership

  • Not including p-value variables

  • Including a new variable with the absolute value of GS

  • Removing information from gray module (unclassified genes) and any other module that did not have a Module-Trait value

  • Objective variable: Binary label indicating if it’s in the SFARI dataset or not (including score 6)

# Drop: p-value columns, the module-assignment column itself, MM columns of the
# removed module(s) (colour names without '#'), and MMgray. Add: absGS (absolute
# Gene Significance) and the binary SFARI objective variable, then drop gene.score
new_dataset = new_dataset %>% dplyr::select(-c(matches(paste('pval', clustering_selected,
                                               gsub('#','',rm_cluster), sep='|')), MMgray)) %>%
              mutate('absGS'=abs(GS), 'SFARI'=ifelse(gene.score=='None', FALSE, TRUE)) %>%
              dplyr::select(-gene.score)

# filter() drops row names, so rebuild them with the same mask used for the filtering
rownames(new_dataset) = rownames(dataset)[!is.na(dataset$MTcor) & dataset[,clustering_selected]!='gray']

rm(rm_cluster)
# Keep the unfiltered dataset around: later chunks read module/MTcor info from it
original_dataset = dataset
dataset = new_dataset
print(paste0('The final dataset contains ', nrow(dataset), ' observations and ', ncol(dataset), ' variables.'))
## [1] "The final dataset contains 14867 observations and 38 variables."
rm(new_dataset)

Sample of the dataset

dataset %>% head %>% kable
MTcor GS MM.00C1A8 MM.4B9FFF MM.8195FF MM.C17FFF MM.C49A00 MM.D29300 MM.FF64B2 MM.FF699D MM.FB61D7 MM.00C0BB MM.53B400 MM.E88523 MM.00B0F6 MM.F365E6 MM.FD6F86 MM.00BBDD MM.00B70C MM.75AF00 MM.DE8C00 MM.B4A000 MM.F8766D MM.00B6EB MM.00BF7D MM.D774FD MM.A3A500 MM.F17E4F MM.00C094 MM.00A8FF MM.00BECD MM.A58AFF MM.E76BF3 MM.FF61C5 MM.00BB45 MM.8EAB00 absGS SFARI
ENSG00000000003 0.7601073 0.3827131 -0.3477130 -0.1449705 -0.1646246 -0.2829441 -0.3416614 -0.1752186 -0.1947862 0.0280602 -0.1139808 -0.1800823 -0.2265495 0.0824399 0.1910086 -0.0125141 -0.0148751 -0.1126751 -0.0522299 0.1702235 -0.0318971 0.5014482 0.3716079 0.1920288 0.3141084 0.3318810 0.4772117 0.4106499 0.1092023 0.1433862 0.1295224 -0.0098212 0.3358176 0.1393598 0.1434403 0.2296628 0.3827131 FALSE
ENSG00000000419 0.4150643 0.0361671 -0.0509414 -0.4181807 -0.3435142 -0.0950710 0.1271936 0.3731103 -0.1661217 -0.0683150 0.1835545 -0.2093919 -0.0438694 0.0019727 0.2711652 0.1487585 0.1426498 -0.1856460 -0.1161695 0.0769049 -0.0790248 -0.2102113 -0.1908434 -0.0142087 -0.0346279 -0.2307758 0.0273226 0.1300158 -0.0003214 -0.1399217 0.1449922 0.3286803 0.1418381 0.1323354 0.5218180 0.2613671 0.0361671 FALSE
ENSG00000000457 -0.9216750 -0.4005675 0.1874271 0.2806782 0.1875352 0.3747611 0.3393944 0.2716503 0.3227764 0.1136580 0.1760050 0.1146922 0.3195916 -0.2275923 -0.0641184 -0.0820587 0.0331912 0.1632690 -0.0397420 -0.1388166 -0.0426981 -0.0552223 -0.0629920 -0.1494477 -0.2855344 -0.2286590 -0.3060407 -0.2607013 -0.3293047 -0.1707641 -0.2804889 -0.0996501 -0.2359453 -0.1367952 -0.0969217 -0.2679999 0.4005675 FALSE
ENSG00000000460 -0.1165650 -0.2386668 0.0081068 0.0453039 -0.1677691 0.1659316 0.3632797 0.4758476 0.1413266 0.0073811 0.0232871 -0.0620729 0.0492475 0.1279434 0.2559762 0.1658171 0.2708015 0.0877830 -0.0212876 -0.1033743 -0.0918440 -0.2760330 -0.2148459 -0.1636790 -0.3609935 -0.4116075 -0.4156419 -0.2123721 -0.2596629 -0.3395212 -0.0682352 0.1426534 0.0297760 0.1965458 0.3086452 -0.2279076 0.2386668 FALSE
ENSG00000000971 0.8556942 0.6586707 -0.2759526 -0.3511587 -0.2780262 -0.3940097 -0.4692445 -0.2210057 -0.1077318 0.0231030 -0.0266761 -0.2443301 -0.2726795 0.0025340 0.0112294 0.2300797 -0.1711726 -0.3056525 0.0899908 -0.0559141 0.0961331 0.0052618 -0.1778786 0.2297739 0.1605946 0.2297970 0.4252786 0.2986677 0.8196572 0.4208318 0.5669501 0.0737886 0.1440107 0.2222202 0.2006217 0.0904164 0.6586707 FALSE
ENSG00000001036 0.7601073 0.3221038 -0.1361035 -0.5330410 -0.4449867 -0.5478868 -0.3157576 0.0951756 -0.3301118 -0.2001291 0.1974429 -0.1947359 -0.1959688 0.1137417 0.0583008 0.0552634 0.0148300 -0.4207842 -0.1377539 0.0398521 0.1730645 -0.0030739 0.2322611 0.0643902 -0.0654919 -0.2230414 0.1810246 0.5690748 0.0845592 0.0307338 0.3770384 0.5580772 0.5086895 0.4994903 0.5013863 0.4209643 0.3221038 FALSE

Objective variable distribution: Unbalanced labels

# Class counts for the binary objective variable (strong imbalance expected)
print(table(dataset$SFARI))
## 
## FALSE  TRUE 
## 14039   828
# mean() of a logical vector gives the proportion of TRUEs (positive rate)
cat(paste0('\n',round(mean(dataset$SFARI)*100,2), '% of the observations are positive'))
## 
## 5.57% of the observations are positive

Visualisations

Visualising the variables

I chose the t-SNE algorithm because it preserves local distances between points

The SFARI label is still close to the absolute value of Gene Significance. This time the MM variables seem to be grouped in 3 clusters

# 2-D t-SNE embedding of the transposed dataset: one point per VARIABLE
# (perplexity kept low because there are only ~38 variables)
tsne = dataset %>% t %>% Rtsne(perplexity=10)

# Tag every variable with its family so the plot can colour them; note the
# checks are ordered — 'absGS' contains 'GS' and falls into the 'GS' group
plot_data = data.frame('ID'=colnames(dataset),
                       'C1'=tsne$Y[,1],
                       'C2'=tsne$Y[,2],
                       type=case_when(grepl('MM', colnames(dataset)) ~ 'ModMembership',
                                      grepl('SFARI', colnames(dataset)) ~ 'SFARI',
                                      grepl('GS', colnames(dataset)) ~ 'GS',
                                      TRUE ~ 'MTcor'))

variables_plot = ggplot(plot_data, aes(C1, C2, color=type)) +
                 geom_point(aes(id=ID)) +
                 theme_minimal() +
                 ggtitle('t-SNE visualisation of variables')
ggplotly(variables_plot)

The Module Membership variables are grouped by Module-Trait correlation, with positive correlations on one side, negative on the other, and modules with low correlation far away from the SFARI tag

# Look-up table: module colour -> Module-Trait correlation (one row per module)
mtcor_by_module = unique(dplyr::select(original_dataset, matches(clustering_selected), MTcor))
names(mtcor_by_module) = c('ID','MTcor')

# Rewrite module colours ('#XXXXXX') as the 'MM.XXXXXX' IDs used in plot_data,
# then attach MTcor to every variable (non-MM variables get NA)
mtcor_by_module$ID = gsub('#', 'MM.', mtcor_by_module$ID)
plot_data = right_join(mtcor_by_module, plot_data, by='ID')

mtcor_plot = ggplot(plot_data, aes(C1, C2, color=MTcor)) +
             geom_point(aes(id=ID)) +
             scale_color_viridis() +
             theme_minimal() +
             ggtitle('t-SNE of variables coloured by Module-Diagnosis correlation')
ggplotly(mtcor_plot)
rm(mtcor_by_module, tsne)

Visualising the observations

# Mean Expression data
# Mean Expression data
# load() brings datExpr into the workspace (and, per the rm() below, also
# datGenes, datMeta, dds and DE_info)
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
# Mean expression of each gene across all samples
mean_expr = data.frame('ID'=rownames(datExpr), 'meanExpr' = rowMeans(datExpr))

# PCA
# PCA of the transposed dataset: pca$rotation then holds one point per GENE
pca = dataset %>% t %>% prcomp

# SFARI genes are drawn more opaque so they stand out in the label plot
plot_data = data.frame('ID'=rownames(dataset), 'PC1'=pca$rotation[,1], 'PC2'=pca$rotation[,2], 
                       'SFARI'=dataset$SFARI, 'MTcor'=dataset$MTcor, 'GS'=dataset$GS) %>%
            mutate(alpha=ifelse(SFARI, 0.7, 0.2)) %>% left_join(mean_expr, by='ID')

# Axis labels include the % of variance explained by each component
p1 = plot_data %>% ggplot(aes(PC1, PC2, color=MTcor)) + geom_point(alpha=0.7) + scale_color_viridis() + 
     theme_minimal() + ggtitle('Genes coloured by Module-Diagnosis correlation') +
     xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1]),'%)')) +
     ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2]),'%)')) +
     theme(legend.position='bottom')

p2 = plot_data %>% ggplot(aes(PC1, PC2, color=GS)) + geom_point(alpha=0.5) + scale_color_viridis() + 
     theme_minimal() + ggtitle('Genes coloured by Gene Significance') + theme(legend.position='bottom')

p3 = plot_data %>% ggplot(aes(PC1, PC2, color=SFARI)) + geom_point(aes(alpha=alpha)) +
     theme_minimal() + ggtitle('Genes coloured by SFARI label') + theme(legend.position='bottom')
# Add per-class marginal density plots to the SFARI panel
p3 = ggExtra::ggMarginal(p3, type='density', groupColour=TRUE, size=10)

p4 = plot_data %>% ggplot(aes(PC1, PC2, color=meanExpr)) + geom_point(alpha=0.5) + scale_color_viridis() + 
     theme_minimal() + ggtitle('Genes coloured by mean level of expression') + theme(legend.position='bottom')

# 2x2 grid: MTcor / GS / SFARI / mean expression colourings of the same PCA
grid.arrange(p1, p2, p3, p4, nrow=2)

# Clean up the PCA objects and everything loaded from the .RData file
rm(pca, datExpr, datGenes, datMeta, dds, DE_info, p1, p2, p3, p4)

Resampling to reduce class imbalance

Should check SMOTE (Synthetic Minority Over-sampling Technique) method for later

Aiming for 1:2 ratio in labels (a bit arbitrary, but I was thinking that 1/3-2/3 doesn’t sound as imbalanced and this way I don’t have to remove that many genes or over-sample so much as with a 1:1 relation)

Under-sampling observations with negative SFARI labels: Keep 1/3 of the negative class

# Under-sample the negative class: keep ~1/3 of the non-SFARI genes.
# Fix the RNG seed so the sampling is reproducible between runs (the prose below
# notes the downstream results vary a lot across runs; an unseeded sample here
# is one source of that). Consistent with the set.seed(123) used for the split.
set.seed(123)
negative_obs = which(!dataset$SFARI)
# Indices of the 2/3 of negative observations to drop
remove_obs = sample(negative_obs, size=floor(length(negative_obs)*2/3))

dataset = dataset[-remove_obs,]

Over-sampling observations with positive SFARI label: Sample with replacement 3x original number of observations

Need to divide first into train and test sets to keep the sets independent

# Reproducible 80/20 train/test split
set.seed(123)
# seq_len() instead of 1:nrow(): 1:nrow(dataset) would yield c(1, 0) on an
# empty data frame, while seq_len() degrades gracefully
train_idx = sample(seq_len(nrow(dataset)), size=floor(0.8*nrow(dataset)))
train_set = dataset[train_idx,]
test_set = dataset[-train_idx,]

Sample with replacement positive observations in train set

# Indices of the positive (SFARI) genes in the training set
positive_obs = which(train_set$SFARI)
# Draw 2x extra copies with replacement, so positives end up at 3x their
# original count after being appended below
add_obs = sample(positive_obs, size=2*length(positive_obs), replace=TRUE)

# seq_len(nrow(.)) instead of 1:nrow(.): safe even for an empty data frame
train_set = train_set[c(seq_len(nrow(train_set)), add_obs),]
table(train_set$SFARI)
## 
## FALSE  TRUE 
##  3749  1971

Logistic Regression

Train model

library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# glm() with a binomial family expects a factor (or 0/1) response
train_set$SFARI = train_set$SFARI %>% as.factor

# Logistic regression on all remaining variables (MTcor, GS, MM.*, absGS)
fit = glm(SFARI~., data=train_set, family='binomial')
# Predicted probability of being a SFARI gene for each test observation
test_set$pred = predict(fit, newdata=test_set, type='response')

Test model

Using AUC instead of accuracy to measure the performance of the model because of the imbalance between classes

# ROCR needs a prediction object pairing predicted scores with true labels
roc_pred = prediction(test_set$pred, test_set$SFARI)

# Area under the ROC curve: a single ranking-quality summary that is robust to
# the class imbalance
auc_value = performance(roc_pred, measure='auc')@y.values[[1]]

# True-positive rate vs false-positive rate across all thresholds, with the
# diagonal marking random-classifier performance
roc_curve = performance(roc_pred, measure='tpr', x.measure='fpr')
plot(roc_curve, main=paste0('ROC curve (AUC=',round(auc_value,2),')'), col='#009999')
abline(a=0, b=1, col='#666666')

# Lift vs rate of positive predictions
lift_curve = performance(roc_pred, measure='lift', x.measure='rpp')
plot(lift_curve, main='Lift curve', col='#86b300')

Analyse model

SFARI genes have a slightly higher predicted-score distribution than the rest

# Compare the distribution of predicted scores for SFARI vs non-SFARI genes
plot_data = dplyr::select(test_set, pred, SFARI)

score_density = ggplot(plot_data, aes(pred, fill=SFARI, color=SFARI)) +
                geom_density(alpha=0.3) +
                theme_minimal() +
                ggtitle('Predicted score distribution by SFARI label')
score_density

Gene significance doesn’t seem to be relevant for the model

summary(fit)
## 
## Call:
## glm(formula = SFARI ~ ., family = "binomial", data = train_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7996  -0.8921  -0.6353   1.1140   2.4638  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.03145    0.05828 -17.698  < 2e-16 ***
## MTcor        -0.05326    0.08141  -0.654  0.51296    
## GS           -1.59267    1.01431  -1.570  0.11637    
## MM.00C1A8    -0.40755    0.81011  -0.503  0.61490    
## MM.4B9FFF     1.35449    2.23466   0.606  0.54443    
## MM.8195FF    -2.55217    2.37102  -1.076  0.28175    
## MM.C17FFF     5.33821    2.94619   1.812  0.07000 .  
## MM.C49A00   -10.08225    3.28362  -3.070  0.00214 ** 
## MM.D29300    -0.11601    2.26198  -0.051  0.95910    
## MM.FF64B2    -3.42565    1.55313  -2.206  0.02741 *  
## MM.FF699D    -0.01221    0.90475  -0.013  0.98923    
## MM.FB61D7     4.78697    2.15352   2.223  0.02622 *  
## MM.00C0BB    -3.49686    2.95924  -1.182  0.23733    
## MM.53B400     5.68630    3.39267   1.676  0.09373 .  
## MM.E88523     2.30670    0.67810   3.402  0.00067 ***
## MM.00B0F6     0.74557    1.18783   0.628  0.53022    
## MM.F365E6    -2.78045    3.25272  -0.855  0.39266    
## MM.FD6F86     6.11707    4.16962   1.467  0.14236    
## MM.00BBDD     2.28886    2.27628   1.006  0.31464    
## MM.00B70C     1.59202    3.00491   0.530  0.59625    
## MM.75AF00    -2.26124    1.07989  -2.094  0.03626 *  
## MM.DE8C00    -0.37769    0.95407  -0.396  0.69220    
## MM.B4A000     2.95619    1.66353   1.777  0.07556 .  
## MM.F8766D    -3.58622    1.44402  -2.483  0.01301 *  
## MM.00B6EB     0.55348    1.11330   0.497  0.61908    
## MM.00BF7D     3.46299    1.68429   2.056  0.03978 *  
## MM.D774FD    -1.00867    1.99616  -0.505  0.61335    
## MM.A3A500    -3.84982    1.99995  -1.925  0.05424 .  
## MM.F17E4F     4.17438    3.19947   1.305  0.19199    
## MM.00C094     2.86735    1.57236   1.824  0.06821 .  
## MM.00A8FF    -5.46096    2.87844  -1.897  0.05780 .  
## MM.00BECD     3.07719    1.89615   1.623  0.10462    
## MM.A58AFF     2.32565    1.64045   1.418  0.15628    
## MM.E76BF3    -0.62129    2.89972  -0.214  0.83035    
## MM.FF61C5    -0.78324    1.46051  -0.536  0.59177    
## MM.00BB45    -6.82551    2.58401  -2.641  0.00826 ** 
## MM.8EAB00    -2.25599    1.88154  -1.199  0.23052    
## absGS         0.36423    0.15628   2.331  0.01977 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7367.7  on 5719  degrees of freedom
## Residual deviance: 6635.1  on 5682  degrees of freedom
## AIC: 6711.1
## 
## Number of Fisher Scoring iterations: 3

There doesn’t seem to be a relation between the modules’ coefficient and their correlation with Diagnosis, although there is correlation between the variables, so it’s not correct to analyse their coefficients when this happens

Also, these results change a lot between different runs

# Coefficient table of the fitted model, flagging terms significant at 0.05;
# ID is taken from the coefficient row names (the 'MM.*' variable names)
var_fit_info = summary(fit)$coefficients %>% as.data.frame %>% 
               mutate(signif=`Pr(>|z|)`<0.05, ID=rownames(summary(fit)$coefficients))

# Module assignment and MTcor for every gene in the ORIGINAL (unfiltered) dataset
plot_data = original_dataset %>% dplyr::select(matches(clustering_selected), MTcor) 
colnames(plot_data) = c('ID','MTcor')

# tally() gives the module size n (used as point size); inner_join keeps only
# modules that actually appear as coefficients in the model
plot_data = plot_data %>% mutate(ID=gsub('#','MM.',ID)) %>% group_by(ID, MTcor) %>% 
            tally %>% inner_join(var_fit_info, by='ID')

# Regression coefficient vs Module-Diagnosis correlation, with a linear trend line
ggplotly(plot_data %>% ggplot(aes(Estimate, MTcor, color=signif, size=n)) + geom_point(aes(id=ID)) + 
         geom_smooth(method='lm', se=FALSE) + theme_minimal() +
         xlab('Coefficient in regression') + ylab('Module-Diagnosis correlation'))
rm(var_fit_info)

Strong correlations between variables

# Pairwise correlations between all predictors; the last column (the SFARI
# factor label) is excluded since cor() needs numeric input
predictors = train_set[, -ncol(train_set)]
cors = cor(predictors)
heatmap.2(cors, dendrogram='none', col=brewer.pal(11,'RdBu'), scale='none', trace='none')

Print genes with highest scores in test set

# Table of the 50 genes with the highest predicted SFARI score in the test set
test_set %>%
  mutate(ID = rownames(test_set)) %>%
  dplyr::select(ID, SFARI, pred) %>%
  arrange(desc(pred)) %>%
  top_n(50, wt=pred) %>%
  kable
ID SFARI pred
ENSG00000145362 TRUE 0.8485268
ENSG00000021645 TRUE 0.8131970
ENSG00000100393 TRUE 0.8121960
ENSG00000170396 TRUE 0.8112124
ENSG00000174469 TRUE 0.7869432
ENSG00000163625 TRUE 0.7784175
ENSG00000126883 FALSE 0.7736464
ENSG00000091656 FALSE 0.7649464
ENSG00000157064 FALSE 0.7613582
ENSG00000170579 TRUE 0.7572440
ENSG00000116698 FALSE 0.7512478
ENSG00000123908 FALSE 0.7485797
ENSG00000196876 TRUE 0.7435254
ENSG00000170448 FALSE 0.7427600
ENSG00000086205 TRUE 0.7340385
ENSG00000078804 FALSE 0.7292197
ENSG00000198646 FALSE 0.7223111
ENSG00000178385 FALSE 0.7177402
ENSG00000119866 TRUE 0.7109069
ENSG00000145016 FALSE 0.7077317
ENSG00000143569 FALSE 0.7040398
ENSG00000163171 FALSE 0.7028713
ENSG00000187605 FALSE 0.7022734
ENSG00000163637 TRUE 0.7011450
ENSG00000159140 FALSE 0.6936414
ENSG00000087460 TRUE 0.6907060
ENSG00000196090 TRUE 0.6866897
ENSG00000144406 TRUE 0.6855405
ENSG00000056661 FALSE 0.6850533
ENSG00000022840 FALSE 0.6833950
ENSG00000197102 TRUE 0.6827331
ENSG00000085433 FALSE 0.6738892
ENSG00000143079 FALSE 0.6713677
ENSG00000132326 TRUE 0.6695043
ENSG00000122882 FALSE 0.6646459
ENSG00000073803 FALSE 0.6617198
ENSG00000149294 FALSE 0.6592623
ENSG00000076356 FALSE 0.6573431
ENSG00000101126 TRUE 0.6558775
ENSG00000081479 TRUE 0.6546380
ENSG00000215421 FALSE 0.6534893
ENSG00000187391 FALSE 0.6531410
ENSG00000185499 FALSE 0.6492286
ENSG00000114541 FALSE 0.6412666
ENSG00000078018 TRUE 0.6386647
ENSG00000164506 TRUE 0.6368599
ENSG00000115568 FALSE 0.6298432
ENSG00000143507 FALSE 0.6296557
ENSG00000171988 TRUE 0.6291181
ENSG00000048471 FALSE 0.6279838